保存并重新加载
Google 账号
IRIS ZHANG
yuxianz77@gmail.com
笔记本
代码 文本

Gemini
# # Run this unit first, restart the session and change this unit to comment, then run the other units
# !pip cache purge
# !pip uninstall -y gensim numpy scipy
# # Since the latest versions of gensim, numpy, and scipy are not compatible, it took many tries to find a non-conflicting version as follows:
# !pip install gensim==4.3.3 numpy==1.25.2 scipy==1.9.3
代码 文本

Gemini
Default GPU Device: /device:GPU:0
代码 文本

Gemini
import pandas as pd
import requests
import time
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#!pip install gensim
import gensim
import gensim.corpora as corpora
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.models import CoherenceModel
!pip install pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from wordcloud import WordCloud
!pip install ollama
!sudo apt update
!sudo apt install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh
!ollama pull deepseek-r1:14b
import ollama
import threading
import subprocess
import time
显示隐藏的输出项
代码 文本

Gemini
# Download the NLTK resources needed for tokenization, stopword filtering,
# VADER sentiment scoring, and WordNet lemmatization.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'vader_lexicon',
                 'wordnet', 'averaged_perceptron_tagger', 'words'):
    nltk.download(resource)
显示隐藏的输出项
代码 文本

Gemini
# Define functions for cleaning text
def clean_text(text):
    """Lowercase, strip punctuation, tokenize, and drop English stopwords.

    Returns the cleaned text as a single space-joined string
    (one string per comment/article).
    """
    # BUG FIX: the original `re.sub(r'[^\w\s]''', text)` was missing a comma,
    # so re.sub received only two arguments and raised a TypeError.
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = text.lower()  # normalize case before tokenizing
    tokens = word_tokenize(text)  # split into word tokens
    stop_words = set(stopwords.words('english'))
    # keep only content-bearing tokens
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)  # cleaned string for this text


# Morphological reduction function
def lemmatize_tokens(tokens):
    """Reduce each token to its WordNet lemma and return the new list."""
    wnl = WordNetLemmatizer()
    lemmas = []
    for tok in tokens:
        lemmas.append(wnl.lemmatize(tok))
    return lemmas

# YouTube comment collection function
def youtube_search_comments(pages, video_id, yourtoken):
    """Fetch up to `pages` pages of top-level comments for a YouTube video.

    Parameters
    ----------
    pages : int
        Maximum number of result pages to fetch (up to 100 comments each).
    video_id : str
        YouTube video id.
    yourtoken : str
        YouTube Data API v3 key.

    Returns
    -------
    pandas.DataFrame with columns CommentId, publishedAt, videoId,
    textDisplay, authorDisplayName, likeCount.
    """
    endpoint_search = "https://www.googleapis.com/youtube/v3/commentThreads"
    columns = ["CommentId", "publishedAt", "videoId", "textDisplay",
               "authorDisplayName", "likeCount"]
    # Accumulate rows in a plain list and build the DataFrame once at the
    # end — per-row pd.concat is quadratic in the number of comments.
    rows = []
    # parameter setting (original export had lost the dict colons/commas)
    parameters = {
        "part": "id,replies,snippet",
        "order": "relevance",
        "maxResults": 100,
        "videoId": video_id,
        "key": yourtoken,
    }
    # Crawl successive pages until the page budget or the last page is hit
    page_counter = 0
    while page_counter < pages:
        response = requests.get(endpoint_search, params=parameters)
        payload = response.json()  # parse once instead of three times
        for item in payload['items']:
            snippet = item['snippet']['topLevelComment']['snippet']
            rows.append([item['id'], snippet['publishedAt'], snippet['videoId'],
                         snippet['textDisplay'], snippet['authorDisplayName'],
                         snippet['likeCount']])
        if 'nextPageToken' in payload:
            # follow the pagination token when the API reports another page
            parameters['pageToken'] = payload['nextPageToken']
            time.sleep(15)  # 15-second delay to avoid hammering the API
            page_counter += 1
        else:
            break
    return pd.DataFrame(rows, columns=columns)


# New York Times article summary collection function
def fetch_free_articles(search_term, token):
    """Query the NYT Article Search API; return up to 30 article summaries.

    Each summary is the article's `snippet`, falling back to
    `lead_paragraph` when the snippet is empty/missing.
    Raises requests.HTTPError on a failed request.
    """
    url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
    params = {
        "q": search_term,
        "api-key": token,
        "sort": "relevance",
        "fl": "snippet,lead_paragraph"
    }
    response = requests.get(url, params=params)
    response.raise_for_status()
    docs = response.json().get("response", {}).get("docs", [])[:30]
    return [doc.get("snippet") or doc.get("lead_paragraph") for doc in docs]
代码 文本

Gemini
# Start the Ollama service to download and use deepseek
def run_ollama_serve():
    """Launch `ollama serve` as a background subprocess (non-blocking)."""
    # argument list form (shell=False) — original export had lost the comma
    subprocess.Popen(["ollama", "serve"])


# Run the server launcher in a thread, then give the daemon time to come up
thread = threading.Thread(target=run_ollama_serve)
thread.start()
time.sleep(5)  # allow the Ollama server time to start listening

# Download Deepseek-r1 14B version (argument commas restored)
try:
    subprocess.run(["ollama", "pull", "deepseek-r1:14b"], check=True)
except subprocess.CalledProcessError as e:
    print(f"Error pulling ollama model: {e}")
代码 文本

Gemini
# Sentiment analysis function

# Method 1: Lexicon-based sentiment analysis
def lexicon_sentiment_score(text):
    """Score `text` with NLTK VADER; return [{'label': ..., 'score': ...}].

    The sign of the compound score picks the label so the output shape
    matches the other two sentiment methods.
    """
    sia = SentimentIntensityAnalyzer()
    s_score = sia.polarity_scores(text)['compound']
    # map the score's sign to a label to harmonize outputs across methods
    if s_score > 0:
        label = "positive"
    elif s_score < 0:
        label = "negative"
    else:
        label = "neutral"
    return [{'label': label, 'score': s_score}]


# Method 2: Sentiment analysis based on supervised machine learning
def ml_sentiment_score(text):
    """Average RoBERTa sentiment over 300-word chunks.

    The text is chunked because the model rejects overly long inputs;
    chunk scores are signed (+positive / -negative / 0 neutral) and
    averaged into one final label/score.

    Returns [{'label': ..., 'score': ...}] like the other methods.
    """
    MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

    max_length = 300  # words per chunk, keeps each chunk within the model's limit
    words = text.split()
    results = []
    # BUG FIX: the original `range(0len(words), max_length)` had lost the comma
    for i in range(0, len(words), max_length):
        chunk = " ".join(words[i:i + max_length])
        results.extend(sentiment_analyzer(chunk))

    # Guard: empty input would otherwise divide by zero below
    if not results:
        return [{'label': 'neutral', 'score': 0}]

    # Signed average: positive chunks add, negative subtract, neutral add 0
    total_score = 0
    for res in results:
        if res['label'] == 'positive':
            total_score += res['score']
        elif res['label'] == 'negative':
            total_score -= res['score']

    average_score = total_score / len(results)

    # Final label follows the sign of the averaged score
    if average_score > 0:
        final_label = 'positive'
    elif average_score < 0:
        final_label = 'negative'
    else:
        final_label = 'neutral'

    return [{'label': final_label, 'score': average_score}]


# Method 3: Sentiment analysis based on large language modeling
def llm_sentiment(text):
    """Ask deepseek-r1 for a sentiment score in [-1, 1]; return float or None.

    Returns None when no number can be extracted from the model's reply
    or when the Ollama call fails.
    """
    try:
        # BUG FIX: the original prompt never included `text`, so the model
        # scored only the instructions (the cell output shows it replying
        # about "the following text" it never received). The commented-out
        # variant in the original confirms the text was meant to be appended.
        prompt = ('Produces a sentiment score for the following text. '
                  'The score ranges from -1 to 1: -1 represents a completely negative sentiment. '
                  '0 represents a neutral sentiment. 1 represents a completely positive sentiment. '
                  'Output the score only. No explanation\nText: ' + text +
                  '\nSentiment Score:\n')
        response = ollama.chat(model='deepseek-r1:14b',
                               messages=[{'role': 'user', 'content': prompt}],
                               options={'temperature': 0.9})
        content = response['message']['content']
        # Extract the LAST number in the reply (deepseek-r1 emits <think>
        # reasoning first). BUG FIX: the original reversed-string search
        # dropped the minus sign of negative scores because the sign trails
        # the digits in the reversed text.
        matches = re.findall(r'[-+]?\d*\.?\d+', content)
        if matches:
            return float(matches[-1])
        print(f"Error: Could not extract score from response: {content}")
        return None
    except Exception as e:
        print(f"Error in LLM sentiment analysis: {e}")
        return None

# Synthesize sentiment analysis functions to harmonize output results
def comprehensive_sentiment_analysis(text):
    """Run all three sentiment methods on `text` and bundle their results."""
    combined = {
        "lexicon_based": lexicon_sentiment_score(text),
        "ml_based": ml_sentiment_score(text),
        "llm_based": llm_sentiment(text),
    }
    return combined
代码 文本

Gemini
# Topic Model Analysis Functions
def topic_model_analysis(texts, num_topics=5):
    """Train an LDA topic model over `texts` and report quality metrics.

    Parameters
    ----------
    texts : list[str]
        Documents to model.
    num_topics : int
        Number of LDA topics (default 5).

    Returns
    -------
    dict with keys: lda_model, perplexity (log perplexity),
    coherence (c_v score), visualization (pyLDAvis prepared data).
    """
    # Tokenize and lemmatize each document
    tokenized_texts = []
    for text in texts:
        tokenized_texts.append(lemmatize_tokens(word_tokenize(text)))

    # Detect frequent phrases (bigrams) and merge them into single tokens
    phrases = Phrases(tokenized_texts, min_count=3, threshold=10,
                      connector_words=ENGLISH_CONNECTOR_WORDS)
    docs = [phrases[doc] for doc in tokenized_texts]

    # Dictionary and bag-of-words corpus
    id2word = corpora.Dictionary(docs)
    corpus = [id2word.doc2bow(doc) for doc in docs]

    # Train the LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                alpha='auto',
                                                per_word_topics=True,
                                                passes=5,
                                                update_every=1)

    # Model quality: log perplexity (lower is better) and c_v coherence
    perplexity_lda = lda_model.log_perplexity(corpus)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=docs,
                                         dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    # Interactive topic visualization
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return {
        "lda_model": lda_model,
        "perplexity": perplexity_lda,
        "coherence": coherence_lda,
        "visualization": vis
    }
代码 文本

Gemini
# collecting data
# NOTE(security): API keys are hard-coded below; move them to environment
# variables or a secrets store (and revoke these) before sharing this notebook.
token = "AIzaSyAHQfG50ZDfhgwwSJK9GL4zKXoxnU3ut-U"
# BUG FIX: the original call `youtube_search_comments(5"iXBtVd7RGHQ", token)`
# had lost the comma between the page count and the video id.
cm = youtube_search_comments(5, "iXBtVd7RGHQ", token)
ytb_texts = [clean_text(comment) for comment in cm['textDisplay'].dropna().astype(str)]
ytb_text = ' '.join(ytb_texts)

token2 = "yiINZrC5sIkGLs4WF2r9Rag7uLoAiwRL"
search_term = "romance book"
texts = fetch_free_articles(search_term, token2)
nyt_texts = [clean_text(t) for t in texts if t]
nyt_text = ' '.join(nyt_texts)
代码 文本

Gemini
# Inspect the cleaned YouTube comment texts (notebook cell display)
ytb_texts
显示隐藏的输出项
代码 文本

Gemini
# Inspect the cleaned NYT article summaries (notebook cell display)
nyt_texts
显示隐藏的输出项
代码 文本

Gemini
# Sentiment analysis: run all three methods on each corpus, then report
ytb_sentiment = comprehensive_sentiment_analysis(ytb_text)
nyt_sentiment = comprehensive_sentiment_analysis(nyt_text)

for header, outcome in (
        ("Sentiment analysis results for YouTube comments:", ytb_sentiment),
        ("Sentiment analysis results from the New York Times article summary:", nyt_sentiment)):
    print(header)
    print(outcome)
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Error: Could not extract score from response: <think>
Alright, let's see what I'm dealing with here. The user has provided a long block of text that seems to be a mix of different things—sentences about books, some emojis, and mentions of various book series like "Red Queen," "Shatter," "Daughter of the Pirate King," etc. At first glance, it's a bit overwhelming because there are so many elements packed in.

I notice the user is expressing their love for these books, especially focusing on the sentiment around villains getting the girl, which they seem to appreciate. They mention specific characters like Aaron Warner and Juliette from "Shatter," and also talk about other series like "Red Queen" with Jacks and Evangeline. There are a lot of exclamation marks and emojis, which indicates strong emotions—probably positive ones.

However, there's some frustration expressed too. The user mentions not being able to let their family know how much they're into books because the covers might scream at them or something like that. Also, they talk about dropping "Red Queen" because it didn't hold their interest, which suggests maybe some negative feelings towards that series.

But overall, most of the text is gushing over these book series and characters. They seem to be in a good mood, excited about reading, especially when it comes to stories where villains get happy endings. The use of words like "obsessed," "loved," "good," and emojis like ❤️ and 🎶 adds to the positive sentiment.

So, putting it all together, despite a couple of minor frustrations, the overall tone is very positive and enthusiastic. They're passionate about these books and characters, which is exciting for them.
</think>

The sentiment score for this text is **POSITIVE**. The user expresses strong enthusiasm and affection for various book series, particularly focusing on themes like villains getting the girl, specific characters, and the emotional impact of these stories. While there are some minor frustrations mentioned, the overall tone reflects excitement and love for reading.
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Sentiment analysis results for YouTube comments:
{'lexicon_based': [{'label': 'positive', 'score': 0.9999}], 'ml_based': [{'label': 'positive', 'score': 0.2546060048043728}], 'llm_based': None}
Sentiment analysis results from the New York Times article summary:
{'lexicon_based': [{'label': 'positive', 'score': 0.9819}], 'ml_based': [{'label': 'neutral', 'score': 0.0}], 'llm_based': 0.0}
代码 文本

Gemini
# Score each YouTube comment individually with the LLM.
# Only the first 25 comments are scored to bound the number of LLM calls;
# slicing replaces the original manual break-counter (same behavior, idiomatic).
ytb_llm_scores = []
for comment in ytb_texts[:25]:
    score = llm_sentiment(comment)
    if score is not None:  # llm_sentiment returns None on extraction failure
        ytb_llm_scores.append(score)
    print(f"[YouTube Comment] Score: {score}")
[YouTube Comment] Score: 1.0
[YouTube Comment] Score: 0.2
[YouTube Comment] Score: 0.1
[YouTube Comment] Score: 0.2
[YouTube Comment] Score: 0.0
[YouTube Comment] Score: 0.0
[YouTube Comment] Score: 0.7
[YouTube Comment] Score: 0.5
[YouTube Comment] Score: 0.2
[YouTube Comment] Score: 0.1
[YouTube Comment] Score: 0.2
[YouTube Comment] Score: 0.2
[YouTube Comment] Score: 1.0
[YouTube Comment] Score: 0.6
[YouTube Comment] Score: 0.7
[YouTube Comment] Score: 0.1
[YouTube Comment] Score: 0.2
[YouTube Comment] Score: 0.9
[YouTube Comment] Score: 0.8
[YouTube Comment] Score: 0.1
[YouTube Comment] Score: 0.1
[YouTube Comment] Score: 0.85
[YouTube Comment] Score: 0.0
[YouTube Comment] Score: 0.0
[YouTube Comment] Score: 0.7
代码 文本

Gemini
# Score every NYT summary with the LLM, keeping only successful scores
nyt_llm_scores = []
for article_text in nyt_texts:
    score = llm_sentiment(article_text)
    if score is not None:  # skip texts where no score could be extracted
        nyt_llm_scores.append(score)
    print(f"[NYT Article Text] Score: {score}")
[NYT Article Text] Score: 0.0
[NYT Article Text] Score: 0.1
[NYT Article Text] Score: 0.0
[NYT Article Text] Score: 1.0
[NYT Article Text] Score: 0.85
[NYT Article Text] Score: 1.0
[NYT Article Text] Score: 0.8
[NYT Article Text] Score: 0.75
[NYT Article Text] Score: 0.5
[NYT Article Text] Score: 0.0
代码 文本

Gemini
# Calculate average LLM scores (0 when no scores were collected at all)
ytb_llm_avg_score = np.mean(ytb_llm_scores) if ytb_llm_scores else 0
nyt_llm_avg_score = np.mean(nyt_llm_scores) if nyt_llm_scores else 0

# Pull the whole-corpus scores from the earlier combined analyses
ytb_lexicon_score = ytb_sentiment["lexicon_based"][0]['score']
ytb_ml_score = ytb_sentiment["ml_based"][0]['score']

nyt_lexicon_score = nyt_sentiment["lexicon_based"][0]['score']
nyt_ml_score = nyt_sentiment["ml_based"][0]['score']

# Integrate data for the three methods
# BUG FIX: the original `['Lexicon''ML''LLM']` had lost its commas, which
# string-concatenated into the single label 'LexiconMLLLM'.
labels = ['Lexicon', 'ML', 'LLM']
ytb_scores = [ytb_lexicon_score, ytb_ml_score, ytb_llm_avg_score]
nyt_scores = [nyt_lexicon_score, nyt_ml_score, nyt_llm_avg_score]
代码 文本

Gemini
# Plot a grouped bar chart comparing the three methods for both sources
x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, ytb_scores, width, label='YTb')
rects2 = ax.bar(x + width/2, nyt_scores, width, label='NYT')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Scores')
ax.set_title('Comparison of Sentiment Scores for YTb and NYT')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    # BUG FIX: original `xytext=(03)` had lost the comma;
                    # (0, 3) means a 3-point vertical offset above the bar.
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

fig.tight_layout()

plt.show()
代码 文本

Gemini
# Keep only the first 10 YouTube scores so both columns have equal length
# (DataFrame construction requires it, and the later line plot relies on it),
# then persist the two score series to an Excel file.
ytb_llm_scores = ytb_llm_scores[:10]
data = {
    "ytb_llm_scores": ytb_llm_scores,
    "nyt_llm_scores": nyt_llm_scores
}
df = pd.DataFrame(data)
try:
    df.to_excel('scores.xlsx', index=False)
    print("save successfully")
except Exception as e:
    print(f"error in saving: {e}")
save successfully
代码 文本

Gemini
# Line plot comparing per-text LLM scores for the two sources
x_axis = np.arange(len(ytb_llm_scores))
plt.figure()
for series, series_label in ((ytb_llm_scores, 'YTb LLM Scores'),
                             (nyt_llm_scores, 'NYT LLM Scores')):
    plt.plot(x_axis, series, label=series_label)
plt.xlabel('Index')
plt.ylabel('LLM Scores')
plt.title('Comparative specific texts sentiment analysis')
plt.legend()
plt.show()
代码 文本

Gemini
# Sanity check before topic modeling: tokenize the first cleaned comment
# and eyeball the result.
print("Sample Segmentation Results:", word_tokenize(ytb_texts[0]))
# The output should be whole words, not single letters.
Sample Segmentation Results: ['book', 'fave', 'also', 'everyone', 'watches', 'subscribes', 'could', 'youtube', 'plaque', 'would', 'dream', 'come', 'true', 'pls', 'subscribe']
代码 文本

Gemini
# Topic modelling analysis of the YouTube comments
topic_result_ytb = topic_model_analysis(ytb_texts)

print("Topic Modelling Perplexity:", topic_result_ytb["perplexity"])
print("Topic Modelling Coherence Score:", topic_result_ytb["coherence"])

# Save the interactive topic visualization to HTML, then render it inline
vis_ytb = topic_result_ytb["visualization"]
pyLDAvis.save_html(vis_ytb, 'topic_visualization_ytb.html')
pyLDAvis.display(vis_ytb)
代码 文本

Gemini
# Topic modelling analysis of the NYT article summaries
topic_result_nyt = topic_model_analysis(nyt_texts)

print("Topic Modelling Perplexity:", topic_result_nyt["perplexity"])
print("Topic Modelling Coherence Score:", topic_result_nyt["coherence"])

# Save the interactive topic visualization to HTML, then render it inline
vis_nyt = topic_result_nyt["visualization"]
pyLDAvis.save_html(vis_nyt, 'topic_visualization_nyt.html')
pyLDAvis.display(vis_nyt)
代码 文本

Gemini
# Persist both cleaned text lists (one document per line) to UTF-8 text files
text_lists = {
    "ytb_texts": ytb_texts,
    "nyt_texts": nyt_texts
}
for key, value in text_lists.items():
    try:
        # BUG FIX: the original `open(f'{key}.txt''w', encoding='utf-8'as file:`
        # had lost the comma after the filename and the `)` before `as`.
        with open(f'{key}.txt', 'w', encoding='utf-8') as file:
            for text in value:
                file.write(text + '\n')
        print(f"{key} is successfully saved")
    except Exception as e:
        print(f"Error in saving {key} to TXT: {e}")
ytb_texts is successfully saved
nyt_texts is successfully saved
代码 文本

Gemini
代码 文本

添加注释